In [396]:
################ IMPORTS ################

import pandas as pd
import numpy as np
import sklearn as sk
from IPython.display import display
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

# Data Visualization

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

%matplotlib inline
sns.set(style='white', context='notebook', palette='deep') 
import matplotlib.style as style
style.use('fivethirtyeight')
pd.set_option('display.max_columns',999)
In [397]:
############## DATA IMPORT ##############

train = pd.read_csv('../data/house_prices_dataset.csv')
display(train.sample(5))
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
1096 1097 70 RM 60.0 6882 Pave NaN Reg Lvl AllPub Inside Gtl IDOTRR Norm Norm 1Fam 2Story 6 7 1914 2006 Gable CompShg Wd Sdng Wd Sdng NaN 0.0 TA TA PConc TA TA No Unf 0 Unf 0 684 684 GasA TA Y SBrkr 773 582 0 1355 0 0 1 1 3 1 Gd 7 Typ 0 NaN NaN NaN NaN 0 0 NaN NaN Y 136 0 115 0 0 0 NaN NaN NaN 0 3 2007 WD Normal 127000
752 753 20 RL 79.0 9236 Pave NaN IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 1Story 6 5 1997 1997 Gable CompShg VinylSd VinylSd NaN 0.0 TA Gd PConc Gd TA Gd GLQ 1200 Unf 0 279 1479 GasA Ex Y SBrkr 1494 0 0 1494 1 0 2 0 3 1 Gd 6 Typ 0 NaN Attchd 1997.0 RFn 2 576 TA TA Y 168 27 0 0 0 0 NaN NaN NaN 0 7 2006 WD Normal 217000
831 832 160 FV 30.0 3180 Pave Pave Reg Lvl AllPub Inside Gtl Somerst Norm Norm TwnhsE 2Story 7 5 2005 2005 Gable CompShg MetalSd MetalSd NaN 0.0 TA TA PConc Gd TA No Unf 0 Unf 0 600 600 GasA Ex Y SBrkr 520 600 80 1200 0 0 2 1 2 1 Gd 4 Typ 0 NaN Detchd 2005.0 RFn 2 480 TA TA Y 0 166 0 0 0 0 NaN NaN NaN 0 6 2006 WD Normal 151000
847 848 20 RL 36.0 15523 Pave NaN IR1 Lvl AllPub CulDSac Gtl CollgCr Norm Norm 1Fam 1Story 5 6 1972 1972 Gable CompShg HdBoard Plywood NaN 0.0 TA TA CBlock TA TA Av BLQ 460 Unf 0 404 864 GasA Ex Y SBrkr 864 0 0 864 1 0 1 0 3 1 TA 5 Typ 1 Fa Attchd 1972.0 Unf 1 338 TA TA Y 0 0 0 0 0 0 NaN NaN NaN 0 8 2009 WD Normal 133500
220 221 20 RL 73.0 8990 Pave NaN IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 1Story 7 5 2006 2006 Gable CompShg VinylSd VinylSd NaN 0.0 Gd TA PConc Gd TA Mn Unf 0 Unf 0 1498 1498 GasA Ex Y SBrkr 1498 0 0 1498 0 0 2 0 2 1 Gd 5 Typ 0 NaN Attchd 2006.0 RFn 2 675 TA TA Y 351 33 0 0 0 0 NaN NaN NaN 0 4 2006 New Partial 204900
In [398]:
############## DATA TYPES ###############

train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallCond    1460 non-null   int64  
 19  YearBuilt      1460 non-null   int64  
 20  YearRemodAdd   1460 non-null   int64  
 21  RoofStyle      1460 non-null   object 
 22  RoofMatl       1460 non-null   object 
 23  Exterior1st    1460 non-null   object 
 24  Exterior2nd    1460 non-null   object 
 25  MasVnrType     588 non-null    object 
 26  MasVnrArea     1452 non-null   float64
 27  ExterQual      1460 non-null   object 
 28  ExterCond      1460 non-null   object 
 29  Foundation     1460 non-null   object 
 30  BsmtQual       1423 non-null   object 
 31  BsmtCond       1423 non-null   object 
 32  BsmtExposure   1422 non-null   object 
 33  BsmtFinType1   1423 non-null   object 
 34  BsmtFinSF1     1460 non-null   int64  
 35  BsmtFinType2   1422 non-null   object 
 36  BsmtFinSF2     1460 non-null   int64  
 37  BsmtUnfSF      1460 non-null   int64  
 38  TotalBsmtSF    1460 non-null   int64  
 39  Heating        1460 non-null   object 
 40  HeatingQC      1460 non-null   object 
 41  CentralAir     1460 non-null   object 
 42  Electrical     1459 non-null   object 
 43  1stFlrSF       1460 non-null   int64  
 44  2ndFlrSF       1460 non-null   int64  
 45  LowQualFinSF   1460 non-null   int64  
 46  GrLivArea      1460 non-null   int64  
 47  BsmtFullBath   1460 non-null   int64  
 48  BsmtHalfBath   1460 non-null   int64  
 49  FullBath       1460 non-null   int64  
 50  HalfBath       1460 non-null   int64  
 51  BedroomAbvGr   1460 non-null   int64  
 52  KitchenAbvGr   1460 non-null   int64  
 53  KitchenQual    1460 non-null   object 
 54  TotRmsAbvGrd   1460 non-null   int64  
 55  Functional     1460 non-null   object 
 56  Fireplaces     1460 non-null   int64  
 57  FireplaceQu    770 non-null    object 
 58  GarageType     1379 non-null   object 
 59  GarageYrBlt    1379 non-null   float64
 60  GarageFinish   1379 non-null   object 
 61  GarageCars     1460 non-null   int64  
 62  GarageArea     1460 non-null   int64  
 63  GarageQual     1379 non-null   object 
 64  GarageCond     1379 non-null   object 
 65  PavedDrive     1460 non-null   object 
 66  WoodDeckSF     1460 non-null   int64  
 67  OpenPorchSF    1460 non-null   int64  
 68  EnclosedPorch  1460 non-null   int64  
 69  3SsnPorch      1460 non-null   int64  
 70  ScreenPorch    1460 non-null   int64  
 71  PoolArea       1460 non-null   int64  
 72  PoolQC         7 non-null      object 
 73  Fence          281 non-null    object 
 74  MiscFeature    54 non-null     object 
 75  MiscVal        1460 non-null   int64  
 76  MoSold         1460 non-null   int64  
 77  YrSold         1460 non-null   int64  
 78  SaleType       1460 non-null   object 
 79  SaleCondition  1460 non-null   object 
 80  SalePrice      1460 non-null   int64  
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
In [399]:
############## CHANGE DATA TYPES ##############

# After checking which data types, we should change categories to objects
# We can populate the object_train list below with the corresponding column names:

# Convert data to object
# object_train = []
# train[object_train] = train[object_train].astype('object')

# We should also convert data to numeric, especially to float64 for good measure 

# Convert data to numeric
# numeric_train = train.select_dtypes(include=[np.number]).columns.tolist()
# train[numeric_train] = train[numeric_train].astype('float64')
In [400]:
############## ASSESS MISSING DATA ##############

variables = train.columns

count = []

for variable in variables:
    length = train[variable].count()
    count.append(length)                                # This will populate count with all variable counts
    
count_pct = np.round(100 * pd.Series(count) / len(train), 2)
count = pd.Series(count)

missing = pd.DataFrame()                                # Initiates the dataframe
missing['variables'] = variables                        # Populates it with the variables list
missing['count'] = len(train) - count       
missing['count_pct'] = 100 - count_pct
missing = missing[missing['count_pct'] > 0]             # If missing % = 0, variable will be removed from this list 
missing.sort_values(by=['count_pct'], inplace=True)     # Will sort by % of missing values
missing_train = np.array(missing['variables'])          # And make the array

display(missing)
variables count count_pct
42 Electrical 1 0.07
26 MasVnrArea 8 0.55
30 BsmtQual 37 2.53
31 BsmtCond 37 2.53
33 BsmtFinType1 37 2.53
32 BsmtExposure 38 2.60
35 BsmtFinType2 38 2.60
64 GarageCond 81 5.55
63 GarageQual 81 5.55
60 GarageFinish 81 5.55
59 GarageYrBlt 81 5.55
58 GarageType 81 5.55
3 LotFrontage 259 17.74
57 FireplaceQu 690 47.26
25 MasVnrType 872 59.73
73 Fence 1179 80.75
6 Alley 1369 93.77
74 MiscFeature 1406 96.30
72 PoolQC 1453 99.52
In [402]:
############ PLOT GRAPHS WITH MISSING DATA ##############

# Create a figure and axes
fig, axes = plt.subplots(1, 2, figsize=(15,6))

# Plots missing data in percentage
axes[0].barh(missing['variables'], missing['count_pct'])
axes[0].set_title('Count of missing training data in percent', fontsize=10)

# Plots total row number of missing data
axes[1].barh(missing['variables'], missing['count'])
axes[1].set_title('Count of missing training data as total records', fontsize=10)
Out[402]:
Text(0.5, 1.0, 'Count of missing training data as total records')
No description has been provided for this image

From the graphs above, one can have an idea of some of the variables that have missing data. One should take care to handle those issues:

High proportion (>40%) - consider removing the feature Moderate proportion (5 to 40%) - consider different imputation procedures (Mean, Median, Mode, Predictive, KNN, etc.) Low proportion (<5%) - consider simple imputation (Mean, Median), or remove the rows with missing values

In [ ]:
############ CODE EXAMPLES ##############

# drop columns with missing values > 40%
#
# threshold = 0.4 * len(train)
# train_clean = train.dropna(axis=1, thresh=threshold)

# Mean Imputation - Replace missing values with the mean (does not work with non numerical values)
#
# imputer = SimpleImputer(strategy='mean')
# train_imputed = pd.DataFrame(imputer.fit_transform(train), columns=train.columns)

# Impute missing values using KNN
#
# imputer = KNNImputer(n_neighbors=5)
# train_imputed = pd.DataFrame(imputer.fit_transform(train), columns=train.columns)
In [ ]:
########### GET FEATURES AND TARGET #############

# Get target
target = 'SalePrice'

# Get quantitative features and delete the unnecessary features
quantitative = [f for f in train.columns if train.dtypes[f] != 'object']
quantitative.remove('SalePrice')                # Remove target 
quantitative.remove('Id')                       # Remove ID 

# Get categorical features
categorical = [f for f in train.columns if train.dtypes[f] == 'object']
In [ ]:
########### CREATE HISTOGRAM OF TARGET ###########

sns.histplot(train[target], kde=True, line_kws={'linewidth': 1})
plt.title('Target Distribution with KDE')
No description has been provided for this image
In [ ]:
########### HISTOGRAMS OF QUANTITATIVE FEATURES ###########

# Assuming DataFrame 'train' and 'quantitative' is the list of columns
f = pd.melt(train, value_vars=quantitative)

# Create a FacetGrid for plotting
g = sns.FacetGrid(f, col="variable", col_wrap=3, sharex=False, sharey=False, height=5)

# Map the histograms with KDE
g.map(sns.histplot, "value", kde=True, line_kws={'linewidth': 1})
g.set_titles(col_template="{col_name}")
g.set(yticks=[], ylabel='')
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x169235e80>
No description has been provided for this image
In [ ]:
########### SCATTERPLOTS OF QUANTITATIVE FEATURES VS TARGET ###########

# Assuming DataFrame 'train' and 'quantitative' is the list of columns
f = pd.melt(train, id_vars=[target], value_vars=quantitative)

# Create a FacetGrid for plotting
g = sns.FacetGrid(f, col="variable", col_wrap=3, sharex=False, sharey=False, height=5)

# Map regplots
g.map(sns.regplot, "value", target, scatter_kws={'s': 10}, line_kws={'linewidth': 1})
g.set_titles(col_template="{col_name}")

# Format y-axis ticks to use 'k' for thousands
def thousand_formatter(x, pos):
    return f'{int(x * 1e-3)}k'

for ax in g.axes.flatten():
    ax.yaxis.set_major_formatter(FuncFormatter(thousand_formatter))
No description has been provided for this image

By looking at the above graphs, one should look for correlations or patterns between the variables. A clear trend might suggest a relationship. Clusters: Check if there are natural groupings or clusters in the data. Outliers: Identify any points that deviate significantly from the general trend.

In [ ]:
########### QUALITATIVE FEATURES ###########

num_features = len(categorical)
num_cols = 3                                                                # Number of columns in the grid
num_rows = (num_features // num_cols) + int(num_features % num_cols > 0)    # Calculate number of rows needed

fig, axes = plt.subplots(num_rows, num_cols, figsize=(18, 4 * num_rows))    # Adjust figsize as needed
axes = axes.flatten()                                                       # Flatten in case of a 2D array of axes

palette = sns.color_palette("husl", 10)                                     # Change the number of colors as needed

for i, feature in enumerate(categorical):
    sns.countplot(data=subset_df, x=feature, ax=axes[i], palette=sns.color_palette("husl"))
    axes[i].tick_params(axis='x', rotation=90)
    axes[i].set_ylabel('')

# Hide any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
No description has been provided for this image
In [ ]:
########### QUALITATIVE FEATURES BOXPLOTS ###########


num_features = len(categorical)
num_cols = 3                                                                # Number of columns in the grid
num_rows = (num_features // num_cols) + int(num_features % num_cols > 0)    # Calculate number of rows needed

fig, axes = plt.subplots(num_rows, num_cols, figsize=(18, 4 * num_rows))    # Adjust figsize as needed

axes = axes.flatten()                                                       # Flatten in case of a 2D array of axes

palette = sns.color_palette("husl", 10)                                     # Change the number of colors as needed

def thousand_formatter(x, pos):
    return f'{int(x * 1e-3)}k'

for i, feature in enumerate(categorical):
    sns.boxplot(data=subset_df, x=feature, y=target, ax=axes[i], palette=palette, fliersize=3)
    axes[i].tick_params(axis='x', rotation=90)
    axes[i].set_ylabel('SalePrice')
    axes[i].yaxis.set_major_formatter(FuncFormatter(thousand_formatter))
    
# Hide any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
No description has been provided for this image
In [ ]:
########### CORRELATION MATRIX ###########

d = train[quantitative]
corr = d.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, 
            square=True, linewidths=.5, annot=False, cmap=cmap)
plt.yticks(rotation=0)
plt.title('Correlation Matrix of all Numerical Variables')
Out[ ]:
Text(0.5, 1.0, 'Correlation Matrix of all Numerical Variables')
No description has been provided for this image
In [ ]:
########### CORRELATION OF NUMERIC FEATURES WITH TARGET ###########

d = train[quantitative + [target]]

# Calculate correlation of each numeric feature with the target variable and drop the target's correlation with itself
correlations = d.corr()[target].drop(target)  

# Sort the correlations by absolute value for better visualization
correlations = correlations.sort_values(ascending=False, key=abs)

# Plotting
plt.figure(figsize=(10, 8))
sns.barplot(x=correlations.values, y=correlations.index, palette="viridis")
plt.title(f'\nCorrelation of Numeric Features with {target}\n', size=16)
plt.xlabel('\nCorrelation Coefficient', size=16)
plt.ylabel('Features', size=16)
Out[ ]:
Text(0, 0.5, 'Features')
No description has been provided for this image
In [ ]: